section .text
bits 64

; Win64 calling convention:
;	input parameters: rcx, rdx, r8, r9, [rsp+28h], [rsp+30h], ...
;	output parameter: rax
;	stack contains: [rsp] (8) return, [rsp+8h] (20h) "shadow space" to preserve registers
;	need to preserve registers: rbx, rsi, rdi, r12, r13, r14, r15, rbp
;	may destroy registers: rax, rcx, rdx, r8, r9, r10, r11

; The caller must push any stack parameters (if needed)
; and then reserve 20h bytes of free stack space (sub rsp,20h).

; A function must keep the stack aligned to a 16-byte boundary (= do 1 push)
; and preserve registers rbx, rsi, rdi, r12, r13, r14, r15, rbp

; Note: Functions with short codes can be conditioned by SOFTASM flag.

; =============================================================================
;                          ADD uint256 (this += num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global AddU256A_x64

; dst += src, both four 64-bit limbs with the least significant limb first.
; The carry out of the top limb is not stored. Scratch: rax, r8.
AddU256A_x64:
		mov		rax,[rdx]			; src limb 0
		mov		r8,[rdx+8]			; src limb 1
		add		[rcx],rax			; limb 0 starts the carry chain
		adc		[rcx+8],r8

		mov		rax,[rdx+16]		; src limb 2 (mov leaves CF alone)
		mov		r8,[rdx+24]			; src limb 3
		adc		[rcx+16],rax
		adc		[rcx+24],r8
		ret

; =============================================================================
;                          ADD uint256 (this += 0:num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global AddU256B_x64

; dst (256-bit) += the low two limbs of src; only src[0] and src[1] are read.
; The carry is propagated through dst limbs 2 and 3. Scratch: rax, r8.
AddU256B_x64:
		mov		rax,[rdx]			; src limb 0
		mov		r8,[rdx+8]			; src limb 1
		add		[rcx],rax
		adc		[rcx+8],r8

		adc		qword [rcx+16],byte 0	; upper limbs only absorb the carry
		adc		qword [rcx+24],byte 0
		ret

; =============================================================================
;                          ADD uint256 (this += 0:0:0:num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 num

global AddU256C_x64

; dst (256-bit) += num, where num is a 64-bit value passed in rdx.
; rax is used as a zero source for the carry-only limbs.
AddU256C_x64:
		xor		eax,eax				; rax = 0 (32-bit write clears the whole register)
		add		[rcx],rdx			; limb 0 += num
		adc		[rcx+8],rax			; limbs 1..3 take the carry only
		adc		[rcx+16],rax
		adc		[rcx+24],rax
		ret

; =============================================================================
;                          ADD uint256 (this = num1 + num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2

global AddU256D_x64

; dst = src1 + src2 (all 256-bit, least significant limb first).
; The final carry is not stored. Scratch: rax, r10.
AddU256D_x64:
		mov		rax,[rdx]			; src1 limbs 0 and 1
		mov		r10,[rdx+8]
		add		rax,[r8]
		adc		r10,[r8+8]
		mov		[rcx],rax			; stores do not disturb CF
		mov		[rcx+8],r10

		mov		rax,[rdx+16]		; src1 limbs 2 and 3
		mov		r10,[rdx+24]
		adc		rax,[r8+16]
		adc		r10,[r8+24]
		mov		[rcx+16],rax
		mov		[rcx+24],r10
		ret

; =============================================================================
;                          ADD uint256 (this = num1 + 0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2

global AddU256E_x64

; dst = src1 (256-bit) + the low two limbs of src2; src2[2] and src2[3]
; are never read. Scratch: rax, r10.
AddU256E_x64:
		mov		rax,[rdx]			; src1 limbs 0 and 1
		mov		r10,[rdx+8]
		add		rax,[r8]
		adc		r10,[r8+8]
		mov		[rcx],rax
		mov		[rcx+8],r10

		mov		rax,[rdx+16]		; src1 limbs 2 and 3 receive the carry only
		mov		r10,[rdx+24]
		adc		rax,byte 0
		adc		r10,byte 0
		mov		[rcx+16],rax
		mov		[rcx+24],r10
		ret

; =============================================================================
;                          ADD uint256 (this = num1 + 0:0:0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64 src2

global AddU256F_x64

; dst = src1 (256-bit) + src2, where src2 is a 64-bit value passed in r8.
; Scratch: rax, r10, r11 (r8 is consumed as the limb 0 accumulator).
AddU256F_x64:
		mov		r10,[rdx+8]			; src1 limb 1
		add		r8,[rdx]			; limb 0 = src2 + src1 limb 0
		adc		r10,byte 0
		mov		[rcx],r8
		mov		[rcx+8],r10

		mov		rax,[rdx+16]		; src1 limbs 2 and 3 take the carry only
		mov		r11,[rdx+24]
		adc		rax,byte 0
		adc		r11,byte 0
		mov		[rcx+16],rax
		mov		[rcx+24],r11
		ret

; =============================================================================
;                          SUB uint256 (this -= num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global SubU256A_x64

; dst -= src, both four 64-bit limbs with the least significant limb first.
; The final borrow is not stored. Scratch: rax, r8.
SubU256A_x64:
		mov		rax,[rdx]			; src limb 0
		mov		r8,[rdx+8]			; src limb 1
		sub		[rcx],rax			; limb 0 starts the borrow chain
		sbb		[rcx+8],r8

		mov		rax,[rdx+16]		; src limb 2 (mov leaves CF alone)
		mov		r8,[rdx+24]			; src limb 3
		sbb		[rcx+16],rax
		sbb		[rcx+24],r8
		ret

; =============================================================================
;                          SUB uint256 (this -= 0:num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global SubU256B_x64

; dst (256-bit) -= the low two limbs of src; only src[0] and src[1] are read.
; The borrow is propagated through dst limbs 2 and 3. Scratch: rax, r8.
SubU256B_x64:
		mov		rax,[rdx]			; src limb 0
		mov		r8,[rdx+8]			; src limb 1
		sub		[rcx],rax
		sbb		[rcx+8],r8

		sbb		qword [rcx+16],byte 0	; upper limbs only absorb the borrow
		sbb		qword [rcx+24],byte 0
		ret

; =============================================================================
;                          SUB uint256 (this -= 0:0:0:num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 num

global SubU256C_x64

; dst (256-bit) -= num, where num is a 64-bit value passed in rdx.
; rax is used as a zero source for the borrow-only limbs.
SubU256C_x64:
		xor		eax,eax				; rax = 0 (32-bit write clears the whole register)
		sub		[rcx],rdx			; limb 0 -= num
		sbb		[rcx+8],rax			; limbs 1..3 take the borrow only
		sbb		[rcx+16],rax
		sbb		[rcx+24],rax
		ret

; =============================================================================
;                          SUB uint256 (this = num1 - num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2

global SubU256D_x64

; dst = src1 - src2 (all 256-bit, least significant limb first).
; The final borrow is not stored. Scratch: rax, r10.
SubU256D_x64:
		mov		rax,[rdx]			; src1 limbs 0 and 1
		mov		r10,[rdx+8]
		sub		rax,[r8]
		sbb		r10,[r8+8]
		mov		[rcx],rax			; stores do not disturb CF
		mov		[rcx+8],r10

		mov		rax,[rdx+16]		; src1 limbs 2 and 3
		mov		r10,[rdx+24]
		sbb		rax,[r8+16]
		sbb		r10,[r8+24]
		mov		[rcx+16],rax
		mov		[rcx+24],r10
		ret

; =============================================================================
;                          SUB uint256 (this = num1 - 0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2

global SubU256E_x64

; dst = src1 (256-bit) - the low two limbs of src2; src2[2] and src2[3]
; are never read. Scratch: rax, r10.
SubU256E_x64:
		mov		rax,[rdx]			; src1 limbs 0 and 1
		mov		r10,[rdx+8]
		sub		rax,[r8]
		sbb		r10,[r8+8]
		mov		[rcx],rax
		mov		[rcx+8],r10

		mov		rax,[rdx+16]		; src1 limbs 2 and 3 take the borrow only
		mov		r10,[rdx+24]
		sbb		rax,byte 0
		sbb		r10,byte 0
		mov		[rcx+16],rax
		mov		[rcx+24],r10
		ret

; =============================================================================
;                          SUB uint256 (this = 0:num1 - num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2

global SubU256F_x64

; dst = (low two limbs of src1, zero-extended to 256 bits) - src2.
; src1[2] and src1[3] are never read. Scratch: rax, r9, r10, r11.
SubU256F_x64:
		xor		r9d,r9d				; implicit src1 limb 2 = 0
		xor		r10d,r10d			; implicit src1 limb 3 = 0

		mov		rax,[rdx]			; src1 limbs 0 and 1
		mov		r11,[rdx+8]
		sub		rax,[r8]
		sbb		r11,[r8+8]
		mov		[rcx],rax
		mov		[rcx+8],r11

		sbb		r9,[r8+16]			; 0 - src2 limb 2 - borrow
		sbb		r10,[r8+24]			; 0 - src2 limb 3 - borrow
		mov		[rcx+16],r9
		mov		[rcx+24],r10
		ret

; =============================================================================
;                          SUB uint256 (this = num1 - 0:0:0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64 src2

global SubU256G_x64

; dst = src1 (256-bit) - src2, where src2 is a 64-bit value passed in r8.
; Scratch: rax, r10.
SubU256G_x64:
		mov		rax,[rdx]			; src1 limbs 0 and 1
		mov		r10,[rdx+8]
		sub		rax,r8				; only limb 0 sees the subtrahend
		sbb		r10,byte 0
		mov		[rcx],rax
		mov		[rcx+8],r10

		mov		rax,[rdx+16]		; src1 limbs 2 and 3 take the borrow only
		mov		r10,[rdx+24]
		sbb		rax,byte 0
		sbb		r10,byte 0
		mov		[rcx+16],rax
		mov		[rcx+24],r10
		ret

; =============================================================================
;                          SUB uint256 (this = 0:0:0:num1 - num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 src1, r8=u64* src2

global SubU256H_x64

; dst = src1 - src2, where src1 is a 64-bit value passed in rdx (zero-extended
; to 256 bits) and src2 is a 256-bit number. Scratch: rax, r9, r10.
SubU256H_x64:
		xor		eax,eax				; implicit src1 limbs 1..3 = 0
		xor		r9d,r9d
		xor		r10d,r10d

		sub		rdx,[r8]			; limb 0
		sbb		rax,[r8+8]			; limb 1 = 0 - src2 limb 1 - borrow
		mov		[rcx],rdx
		mov		[rcx+8],rax

		sbb		r9,[r8+16]			; limb 2
		sbb		r10,[r8+24]			; limb 3
		mov		[rcx+16],r9
		mov		[rcx+24],r10
		ret

; =============================================================================
;                          SUB uint256 (this = num - this)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global SubU256I_x64

; Reverse subtract in place: dst = src - dst (both 256-bit).
; Scratch: rax, r8.
SubU256I_x64:
		mov		rax,[rdx]			; src limbs 0 and 1
		mov		r8,[rdx+8]
		sub		rax,[rcx]			; minus the current dst limbs
		sbb		r8,[rcx+8]
		mov		[rcx],rax
		mov		[rcx+8],r8

		mov		rax,[rdx+16]		; src limbs 2 and 3
		mov		r8,[rdx+24]
		sbb		rax,[rcx+16]
		sbb		r8,[rcx+24]
		mov		[rcx+16],rax
		mov		[rcx+24],r8
		ret

; =============================================================================
;                          SUB uint256 (this = 0:num - this)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global SubU256J_x64

; Reverse subtract in place: dst = (low two limbs of src, zero-extended) - dst.
; src[2] and src[3] are never read. Scratch: rax, r8, r9, r10.
SubU256J_x64:
		xor		r9d,r9d				; implicit src limb 2 = 0
		xor		r10d,r10d			; implicit src limb 3 = 0

		mov		rax,[rdx]			; src limbs 0 and 1
		mov		r8,[rdx+8]
		sub		rax,[rcx]
		sbb		r8,[rcx+8]
		mov		[rcx],rax
		mov		[rcx+8],r8

		sbb		r9,[rcx+16]			; 0 - dst limb 2 - borrow
		sbb		r10,[rcx+24]		; 0 - dst limb 3 - borrow
		mov		[rcx+16],r9
		mov		[rcx+24],r10
		ret

; =============================================================================
;                          SUB uint256 (this = 0:0:0:num - this)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 num

global SubU256K_x64

; Reverse subtract in place: dst = num - dst, where num is a 64-bit value
; passed in rdx (zero-extended to 256 bits). Scratch: rax, r8, r9.
SubU256K_x64:
		xor		eax,eax				; implicit num limbs 1..3 = 0
		xor		r8d,r8d
		xor		r9d,r9d

		sub		rdx,[rcx]			; limb 0
		sbb		rax,[rcx+8]			; limb 1 = 0 - dst limb 1 - borrow
		mov		[rcx],rdx
		mov		[rcx+8],rax

		sbb		r8,[rcx+16]			; limb 2
		sbb		r9,[rcx+24]			; limb 3
		mov		[rcx+16],r8
		mov		[rcx+24],r9
		ret

; =============================================================================
;                          INC uint256 (this++)
; =============================================================================
; inputs: rcx=u64* dst

global IncU256A_x64

; In-place increment of the 256-bit number at rcx. No registers are modified.
IncU256A_x64:
		add		qword [rcx+0],byte 1	; add (not inc) so CF is produced for the adc chain
		adc		qword [rcx+8],byte 0	; ripple the carry upwards
		adc		qword [rcx+16],byte 0
		adc		qword [rcx+24],byte 0
		ret

; =============================================================================
;                          INC uint256 (this = num + 1)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global IncU256B_x64

; dst = src + 1 (256-bit). Scratch: rax, r9.
IncU256B_x64:
		mov		rax,[rdx]			; src limbs 0 and 1
		mov		r9,[rdx+8]
		add		rax,byte 1			; add (not inc) so CF feeds the adc chain
		adc		r9,byte 0
		mov		[rcx],rax
		mov		[rcx+8],r9

		mov		rax,[rdx+16]		; src limbs 2 and 3 take the carry only
		mov		r9,[rdx+24]
		adc		rax,byte 0
		adc		r9,byte 0
		mov		[rcx+16],rax
		mov		[rcx+24],r9
		ret

; =============================================================================
;                          DEC uint256 (this--)
; =============================================================================
; inputs: rcx=u64* dst

global DecU256A_x64

; In-place decrement of the 256-bit number at rcx. No registers are modified.
DecU256A_x64:
		sub		qword [rcx+0],byte 1	; sub (not dec) so CF is produced for the sbb chain
		sbb		qword [rcx+8],byte 0	; ripple the borrow upwards
		sbb		qword [rcx+16],byte 0
		sbb		qword [rcx+24],byte 0
		ret

; =============================================================================
;                          DEC uint256 (this = num - 1)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global DecU256B_x64

; dst = src - 1 (256-bit). Scratch: rax, r9.
DecU256B_x64:
		mov		rax,[rdx]			; src limbs 0 and 1
		mov		r9,[rdx+8]
		sub		rax,byte 1			; sub (not dec) so CF feeds the sbb chain
		sbb		r9,byte 0
		mov		[rcx],rax
		mov		[rcx+8],r9

		mov		rax,[rdx+16]		; src limbs 2 and 3 take the borrow only
		mov		r9,[rdx+24]
		sbb		rax,byte 0
		sbb		r9,byte 0
		mov		[rcx+16],rax
		mov		[rcx+24],r9
		ret

; =============================================================================
;                          MUL uint256 (this = num1 * num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2

global MulU256A_x64

; Computes dst = num1 * num2 and keeps only the low 256 bits of the product.
; If the upper two limbs of either operand are zero, control is transferred
; to MulU256B_x64 (directly, or through MulU256B_x64_2 which swaps rdx and r8).
; r12..r15 are saved in the shadow space at [rsp+8]..[rsp+32] instead of being
; pushed, so rsp is never changed here.
MulU256A_x64:

; ----- small number

		mov		rax,[r8+16]
		or		rax,[r8+24]		; zero only when num2 limbs 2 and 3 are both 0
		jz		MulU256B_x64

		mov		rax,[rdx+16]
		or		rax,[rdx+24]	; same test for num1
		jz		MulU256B_x64_2	; this entry exchanges rdx and r8 first

; ----- push registers r12, r13, r14, r15

		mov		[rsp+8],r12
		mov		[rsp+16],r13
		mov		[rsp+24],r14
		mov		[rsp+32],r15

; ----- get num1 -> r15:r14:r13:r12

		mov		r15,[rdx+24]
		mov		r14,[rdx+16]
		mov		r13,[rdx+8]
		mov		r12,[rdx]		; rdx is free from here on (mul overwrites it)

; ----- multiply num1 * num2.N0, result -> r11:r10:r9:[rcx]

		mov		r11,[r8]		; r11 = num2.N0, reloaded into rax before each mul

		mov		rax,r11
		mul		r12
		mov		[rcx],rax		; result limb 0 is final
		mov		r9,rdx
		
		mov		rax,r11
		mul		r13
		add		r9,rax
		adc		rdx,byte 0
		mov		r10,rdx
		
		mov		rax,r11
		mul		r14
		add		r10,rax
		adc		rdx,byte 0
		xchg	rax,r11			; rax <- num2.N0 (r11 receives the already-added low word)
		xchg	r11,rdx			; r11 <- high word carried into limb 3
		
		mul		r15
		add		r11,rax			; only the low half contributes; the rest is beyond 256 bits
		
; ----- multiply num1 * num2.N1
		
		mov		r15,[r8+8]		; num1.N3 is no longer needed, r15 now holds num2.N1
		
		mov		rax,r15
		mul		r12
		add		r9,rax
		adc		r10,rdx
		adc		r11,byte 0
		mov		[rcx+8],r9		; result limb 1 is final
		
		mov		rax,r15
		mul		r13
		add		r10,rax
		adc		r11,rdx
		
		xchg	rax,r15			; rax <- num2.N1
		mul		r14
		add		r11,rax			; low half only
		
; ----- multiply num1 * num2.N2
		
		mov		r15,[r8+16]
		
		mov		rax,r15
		mul		r12
		add		r10,rax
		adc		r11,rdx
		mov		[rcx+16],r10	; result limb 2 is final
		
		xchg	rax,r15			; rax <- num2.N2
		mul		r13
		add		r11,rax			; low half only
		
; ----- multiply num1 * num2.N3

		mov		rax,[r8+24]
		mul		r12
		add		rax,r11
		mov		[rcx+24],rax	; result limb 3

; ----- pop registers r12, r13, r14, r15

		mov		r15,[rsp+32]
		mov		r14,[rsp+24]
		mov		r13,[rsp+16]
		mov		r12,[rsp+8]
		ret

; =============================================================================
;                          MUL uint256 (this = num1 * 0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2

; Secondary entry: exchanges the two source pointers and falls through into
; MulU256B_x64, so the operand passed in rdx becomes the one passed in r8.
MulU256B_x64_2:

		xchg	rdx,r8

global MulU256B_x64

; Computes the low 256 bits of num1 * num2 using only limbs 0 and 1 of num2
; (num2 limbs 2 and 3 are never read here). Hands off to MulU256C_x64_2 when
; num2 limb 1 is zero and to MulU256D_x64 when num1 limbs 2 and 3 are zero.
; r12..r15 are saved in the shadow space at [rsp+8]..[rsp+32].
MulU256B_x64:

; ----- small number

		cmp		qword [r8+8],0
		je		MulU256C_x64_2	; num2 limb 1 is zero: 64-bit multiplier path
		
		mov		rax,[rdx+16]
		or		rax,[rdx+24]	; zero only when num1 limbs 2 and 3 are both 0
		jz		MulU256D_x64

; ----- push registers r12, r13, r14, r15

		mov		[rsp+8],r12
		mov		[rsp+16],r13
		mov		[rsp+24],r14
		mov		[rsp+32],r15

; ----- get num1 -> r15:r14:r13:r12

		mov		r15,[rdx+24]
		mov		r14,[rdx+16]
		mov		r13,[rdx+8]
		mov		r12,[rdx]		; rdx is free from here on (mul overwrites it)

; ----- multiply num1 * num2.N0, result -> r11:r10:r9:[rcx]

		mov		r11,[r8]		; r11 = num2.N0, reloaded into rax before each mul

		mov		rax,r11
		mul		r12
		mov		[rcx],rax		; result limb 0 is final
		mov		r9,rdx
		
		mov		rax,r11
		mul		r13
		add		r9,rax
		adc		rdx,byte 0
		mov		r10,rdx
		
		mov		rax,r11
		mul		r14
		add		r10,rax
		adc		rdx,byte 0
		xchg	rax,r11			; rax <- num2.N0 (r11 receives the already-added low word)
		xchg	r11,rdx			; r11 <- high word carried into limb 3
		
		mul		r15
		add		r11,rax			; only the low half contributes; the rest is beyond 256 bits
		
; ----- multiply num1 * num2.N1
		
		mov		r15,[r8+8]		; num1.N3 is no longer needed, r15 now holds num2.N1
		
		mov		rax,r15
		mul		r12
		add		r9,rax
		adc		r10,rdx
		adc		r11,byte 0
		mov		[rcx+8],r9		; result limb 1 is final
		
		mov		rax,r15
		mul		r13
		add		r10,rax
		adc		r11,rdx
		mov		[rcx+16],r10	; result limb 2 is final
		
		xchg	rax,r15			; rax <- num2.N1
		mul		r14
		add		rax,r11			; low half only
		mov		[rcx+24],rax	; result limb 3
		
; ----- pop registers r12, r13, r14, r15

		mov		r15,[rsp+32]
		mov		r14,[rsp+24]
		mov		r13,[rsp+16]
		mov		r12,[rsp+8]
		ret

; =============================================================================
;                          MUL uint256 (this = num1 * 0:0:0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64 src2

; Secondary entry used when r8 still holds a pointer: load the 64-bit
; multiplier from it and fall through into MulU256C_x64.
MulU256C_x64_2:
		mov		r8,[r8]

global MulU256C_x64

; dst = low 256 bits of src1 (256-bit, pointer in rdx) * src2 (64-bit value
; in r8). If src1 limbs 2 and 3 are zero, control goes to MulU256E_x64 with
; rcx, rdx and r8 untouched. Scratch: rax, rdx, r9, r11.
MulU256C_x64:
		mov		rax,[rdx+16]
		or		rax,[rdx+24]
		jz		MulU256E_x64		; upper half of src1 is zero

		mov		r11,rdx				; r11 = src1; rdx is overwritten by each mul

		mov		rax,[r11]			; limb 0
		mul		r8
		mov		[rcx],rax
		mov		r9,rdx				; r9 = carry word into the next limb

		mov		rax,[r11+8]			; limb 1
		mul		r8
		add		rax,r9
		adc		rdx,byte 0
		mov		[rcx+8],rax
		mov		r9,rdx

		mov		rax,[r11+16]		; limb 2
		mul		r8
		add		rax,r9
		adc		rdx,byte 0
		mov		[rcx+16],rax
		mov		r9,rdx

		mov		rax,[r11+24]		; limb 3: the high half is discarded
		mul		r8
		add		rax,r9
		mov		[rcx+24],rax
		ret

; =============================================================================
;                          MUL uint256 (this = 0:num1 * 0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2

global MulU256D_x64

; dst = (low two limbs of src1) * (low two limbs of src2), full 256-bit result.
; Limbs 2 and 3 of both sources are never read. When limb 1 of either operand
; is zero, control goes to the 128 x 64 routine (MulU256E_x64_2 / MulU256E_x64).
; Scratch: rax, rdx, r8, r9, r10, r11.
MulU256D_x64:

; ----- load num1 -> r11:r10

		mov		r11,[rdx+8]			; num1.N1
		mov		r10,[rdx]			; num1.N0 (MulU256E_x64_2 expects it in r10)
		test	r11,r11
		jz		MulU256E_x64_2		; num1 is a single limb

; ----- load num2 -> r9:r8

		mov		r9,[r8+8]			; num2.N1
		mov		r8,[r8]				; num2.N0 (r8 turns from pointer into value)
		test	r9,r9
		jz		MulU256E_x64		; num2 is a single limb; rdx still points to num1

; ----- num1.N0 * num2.N0 -> r8:[rcx]

		mov		rax,r8
		mul		r10
		mov		[rcx],rax			; result limb 0
		mov		rax,r8				; rax = num2.N0 again
		mov		r8,rdx				; r8 = high word of the first product

; ----- num1.N1 * num2.N0 -> r10:r8

		mul		r11
		add		r8,rax
		adc		rdx,byte 0
		mov		rax,r10				; rax = num1.N0 for the next product
		mov		r10,rdx				; r10 = running high word

; ----- num1.N0 * num2.N1 -> r8(carry):r10:[rcx+8]

		mul		r9
		add		rax,r8
		mov		[rcx+8],rax			; result limb 1
		adc		r10,rdx
		mov		r8d,0				; mov, not xor: CF from the adc above must survive
		adc		r8,r8				; r8 = that carry (0 or 1)

; ----- num1.N1 * num2.N1

		mov		rax,r9
		mul		r11
		add		rax,r10
		mov		[rcx+16],rax		; result limb 2
		adc		r8,rdx
		mov		[rcx+24],r8			; result limb 3
		ret

; =============================================================================
;                          MUL uint256 (this = 0:num1 * 0:0:0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64 src2

; Secondary entry reached from MulU256D_x64 with r8 = pointer to the two-limb
; operand and r10 = the single-limb operand: rearrange into the layout
; MulU256E_x64 expects (rdx = pointer, r8 = value) and fall through.
MulU256E_x64_2:
		mov		rdx,r8
		mov		r8,r10

global MulU256E_x64

; dst = (low two limbs of src1, pointer in rdx) * src2 (64-bit value in r8).
; The product has at most three limbs; dst limb 3 is written as zero.
; If src1 limb 1 is zero, control goes to MulU256F_x64 with rdx = src1 limb 0.
; Scratch: rax, rdx, r9, r10.
MulU256E_x64:
		mov		r9,[rdx+8]			; src1 limb 1
		mov		rdx,[rdx]			; src1 limb 0 (rdx turns from pointer into value)
		test	r9,r9
		jz		MulU256F_x64		; plain 64 x 64 multiply

		mov		rax,rdx				; limb 0 * src2
		mul		r8
		mov		[rcx],rax
		mov		r10,rdx				; carry word

		mov		rax,r8				; limb 1 * src2
		mul		r9
		add		rax,r10
		adc		rdx,byte 0
		mov		[rcx+8],rax

		mov		[rcx+16],rdx
		mov		qword [rcx+24],0
		ret

; =============================================================================
;                          MUL uint256 (this = 0:0:0:num1 * 0:0:0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 src1, r8=u64 src2

global MulU256F_x64

; dst = src1 * src2 for two 64-bit values (rdx and r8). The 128-bit product
; goes to dst limbs 0 and 1; limbs 2 and 3 are written as zero.
MulU256F_x64:
		mov		rax,rdx				; rax = src1
		mul		r8					; rdx:rax = src1 * src2
		mov		[rcx],rax
		mov		[rcx+8],rdx
		mov		qword [rcx+16],0
		mov		qword [rcx+24],0
		ret

; =============================================================================
;                MUL uint256 with full range (this = num1 * num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2, r9=u64* high

global MulU256G_x64

; Full 256 x 256 -> 512 bit multiply. The low 256 bits of num1 * num2 are
; written to dst (rcx) and the high 256 bits to the buffer passed in r9.
; r12..r15 are saved in the shadow space; rbx, rbp and the r9 pointer are
; pushed. Four 64-bit accumulator registers plus one carry register rotate
; roles between the passes, as noted in each pass header.
MulU256G_x64:

; ----- push registers r12, r13, r14, r15, rbx, rbp, r9

		mov		[rsp+8],r12
		mov		[rsp+10h],r13
		mov		[rsp+18h],r14
		mov		[rsp+20h],r15
		push	rbx
		push	rbp
		push	r9				; pointer to the high result, popped into rcx at the end

; ----- get num1 -> r15:r14:r13:r12

		mov		r15,[rdx+24]
		mov		r14,[rdx+16]
		mov		r13,[rdx+8]
		mov		r12,[rdx]		; rdx is free from here on (mul overwrites it)

; ----- multiply num1 * num2.N0, result -> r11:r10:r9:rbx:[rcx]
 ; FF FF FF FF * FF = FE FF FF FF 01 (result in 5 registers)
 
		mov		rbp,[r8]		; rbp = current limb of num2

		mov		rax,rbp
		mul		r12
		mov		[rcx],rax		; low result limb 0 is final
		mov		rbx,rdx
		
		mov		rax,rbp
		mul		r13
		add		rbx,rax
		adc		rdx,byte 0
		mov		r9,rdx
		
		mov		rax,rbp
		mul		r14
		add		r9,rax
		adc		rdx,byte 0
		mov		r10,rdx

		mov		rax,rbp		
		mul		r15
		add		r10,rax
		adc		rdx,byte 0
		mov		r11,rdx
		
; ----- multiply num1 * num2.N1, result -> rbx:r11:r10:r9:[rcx+8]
; FE FF FF FF 01 + FE FF FF FF = FF FE FF FF 00 (result in 5 registers)

		mov		rbp,[r8+8]
		
		mov		rax,rbp
		mul		r12
		add		rbx,rax
		adc		r9,rdx
		mov		[rcx+8],rbx		; low result limb 1 is final (mov keeps CF)
		adc		r10,byte 0
		adc		r11,byte 0
		mov		rbx,0			; mov, not xor: CF from the adc chain must survive
		adc		rbx,byte 0		; rbx restarts as the new top word = carry
		
		mov		rax,rbp
		mul		r13
		add		r9,rax
		adc		r10,rdx
		adc		r11,byte 0
		adc		rbx,byte 0
		
		mov		rax,rbp
		mul		r14
		add		r10,rax
		adc		r11,rdx
		adc		rbx,byte 0

		mov		rax,rbp
		mul		r15
		add		r11,rax
		adc		rbx,rdx
		
; ----- multiply num1 * num2.N2 -> r9:rbx:r11:r10:[rcx+16]
; FE FF FF FF 01 + FF FE FF FF = FF FF FE FF 00 (result in 5 registers)
		
		mov		rbp,[r8+16]
		
		mov		rax,rbp
		mul		r12
		add		r9,rax
		adc		r10,rdx
		mov		[rcx+16],r9		; low result limb 2 is final (mov keeps CF)
		adc		r11,byte 0
		adc		rbx,byte 0
		mov		r9,0			; mov, not xor: CF must survive
		adc		r9,byte 0		; r9 restarts as the new top word = carry
		
		mov		rax,rbp
		mul		r13
		add		r10,rax
		adc		r11,rdx
		adc		rbx,byte 0
		adc		r9,byte 0
		
		mov		rax,rbp
		mul		r14
		add		r11,rax
		adc		rbx,rdx
		adc		r9,byte 0

		mov		rax,rbp
		mul		r15
		add		rbx,rax
		adc		r9,rdx
		
; ----- multiply num1 * num2.N3 -> r10:r9:rbx:r11:[rcx+24]
; FE FF FF FF 01 + FF FF FE FF = FF FF FF FE 00 (result in 5 registers)

		mov		rbp,[r8+24]
		
		mov		rax,rbp
		mul		r12
		add		r10,rax
		adc		r11,rdx
		mov		[rcx+24],r10	; low result limb 3 is final (mov keeps CF)
		adc		rbx,byte 0
		adc		r9,byte 0
		mov		r10,0			; mov, not xor: CF must survive
		adc		r10,byte 0		; r10 restarts as the new top word = carry
		
		mov		rax,rbp
		mul		r13
		add		r11,rax
		adc		rbx,rdx
		adc		r9,byte 0
		adc		r10,byte 0
		
		mov		rax,rbp
		mul		r14
		add		rbx,rax
		adc		r9,rdx
		adc		r10,byte 0

		mov		rax,rbp
		mul		r15
		add		r9,rax
		adc		r10,rdx
		
; ----- save HIGH result

		pop		rcx				; rcx <- the r9 argument pushed on entry
		mov		[rcx],r11
		mov		[rcx+8],rbx
		mov		[rcx+16],r9
		mov		[rcx+24],r10

; ----- pop registers r12, r13, r14, r15, rbx, rbp

		pop		rbp
		pop		rbx
		mov		r15,[rsp+20h]	; rsp is back at its entry value, so the shadow offsets match
		mov		r14,[rsp+18h]
		mov		r13,[rsp+10h]
		mov		r12,[rsp+8]
		ret

; =============================================================================
;                          MUL10 uint256 (this = num * 10)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src
; output: rax=carry

global MulU256H_x64

; dst = low 256 bits of src * 10; the overflow word above the top limb is
; returned in rax. Scratch: rdx, r8, r9, r11.
MulU256H_x64:
		mov		r11,rdx				; r11 = src; rdx is overwritten by each mul
		mov		r8d,10				; multiplier (32-bit write zero-extends into r8)

		mov		rax,[r11]			; limb 0
		mul		r8
		mov		[rcx],rax
		mov		r9,rdx				; r9 = carry word into the next limb

		mov		rax,[r11+8]			; limb 1
		mul		r8
		add		rax,r9
		adc		rdx,byte 0
		mov		[rcx+8],rax
		mov		r9,rdx

		mov		rax,[r11+16]		; limb 2
		mul		r8
		add		rax,r9
		adc		rdx,byte 0
		mov		[rcx+16],rax
		mov		r9,rdx

		mov		rax,[r11+24]		; limb 3
		mul		r8
		add		rax,r9
		adc		rdx,byte 0
		mov		[rcx+24],rax

		mov		rax,rdx				; return value: carry out of the top limb
		ret

; =============================================================================
;                          SQR uint256 (this = num * num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global SqrU256A_x64

; dst = low 256 bits of src * src. The square terms are computed once and
; each cross product is added twice. If src limbs 2 and 3 are zero, control
; goes to SqrU256B_x64. r12..r14 are saved in the shadow space at
; [rsp+8]..[rsp+24]. Scratch: rax, rdx, r8, r9, r10, r11.
SqrU256A_x64:

; ----- operand fits in 128 bits?

		mov		rax,[rdx+16]
		or		rax,[rdx+24]
		jz		SqrU256B_x64

; ----- save r12, r13, r14 in the shadow space

		mov		[rsp+8],r12
		mov		[rsp+16],r13
		mov		[rsp+24],r14

; ----- load the operand: N3=r14, N2=r13, N1=r12, N0=r11

		mov		r14,[rdx+24]
		mov		r13,[rdx+16]
		mov		r12,[rdx+8]
		mov		r11,[rdx]

; ----- square terms -> r10:r9:r8:[rcx]

		mov		rax,r11
		mul		r11					; N0*N0
		mov		[rcx],rax			; result limb 0
		mov		r8,rdx

		mov		rax,r12
		mul		r12					; N1*N1
		mov		r9,rax
		mov		r10,rdx

; ----- cross terms, each added twice

		mov		rax,r11
		mul		r12					; N0*N1, weight 2^64
		add		r8,rax
		adc		r9,rdx
		adc		r10,byte 0
		add		r8,rax				; second copy
		adc		r9,rdx
		adc		r10,byte 0
		mov		[rcx+8],r8			; result limb 1

		mov		rax,r11
		mul		r13					; N0*N2, weight 2^128
		add		r9,rax
		adc		r10,rdx
		add		r9,rax				; second copy
		adc		r10,rdx
		mov		[rcx+16],r9			; result limb 2

		mov		rax,r11
		mul		r14					; N0*N3, weight 2^192: low half only
		add		r10,rax
		add		r10,rax

		mov		rax,r12
		mul		r13					; N1*N2, weight 2^192: low half only
		add		r10,rax
		add		r10,rax
		mov		[rcx+24],r10		; result limb 3

; ----- restore r12, r13, r14

		mov		r14,[rsp+24]
		mov		r13,[rsp+16]
		mov		r12,[rsp+8]
		ret

; =============================================================================
;                          SQR uint256 (this = 0:num * 0:num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src

global SqrU256B_x64

; dst = (low two limbs of src)^2, full 256-bit result. src limbs 2 and 3 are
; never read. If src limb 1 is zero, control goes to SqrU256C_x64_2 with rdx
; still pointing at src. Scratch: rax, rdx, r8, r9, r10, r11.
SqrU256B_x64:

; ----- load the operand: N1=r11, N0=r9

		mov		r11,[rdx+8]
		test	r11,r11
		jz		SqrU256C_x64_2
		mov		r9,[rdx]

; ----- N0*N0 -> r8:[rcx]

		mov		rax,r9
		mul		rax
		mov		[rcx],rax			; result limb 0
		mov		r8,rdx				; r8 = accumulator for limb 1

; ----- cross product N0*N1 -> r10:r9 (N0 is not needed afterwards)

		mov		rax,r9
		mul		r11
		mov		r9,rax
		mov		r10,rdx

; ----- N1*N1 -> rdx:rax, used directly as the accumulators for limbs 2 and 3

		mov		rax,r11
		mul		rax

; ----- add the cross product twice at weight 2^64

		add		r8,r9
		adc		rax,r10
		adc		rdx,byte 0
		add		r8,r9
		adc		rax,r10
		adc		rdx,byte 0

; ----- save result

		mov		[rcx+8],r8
		mov		[rcx+16],rax
		mov		[rcx+24],rdx
		ret

; =============================================================================
;                          SQR uint256 (this = 0:0:0:num * 0:0:0:num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 src

; Secondary entry used when rdx still holds a pointer: load the 64-bit
; operand from it and fall through into SqrU256C_x64.
SqrU256C_x64_2:
		mov		rdx,[rdx]

global SqrU256C_x64

; dst = src * src for a 64-bit value in rdx. The 128-bit square goes to
; dst limbs 0 and 1; limbs 2 and 3 are written as zero.
SqrU256C_x64:
		mov		rax,rdx
		mul		rdx					; rdx:rax = src * src
		mov		[rcx],rax
		mov		[rcx+8],rdx
		mov		qword [rcx+16],0
		mov		qword [rcx+24],0
		ret

; =============================================================================
;                          DIV uint256 (this = num1 / num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2, r9=u64* rem

global DivU256A_x64

; Bit-by-bit (shift and subtract) division: dst = num1 / num2, with the
; remainder stored through r9 when that pointer is not NULL.
; The dividend sits in the low half of a 512-bit register group r15..r8 and
; is shifted left one bit per iteration; the upper half (r15..r12) holds the
; running remainder. A one-bit mask walks from bit 255 down to bit 0 and is
; OR-ed into the quotient accumulator whenever the divisor is subtracted.
; No check for a zero divisor is made here.
DivU256A_x64:

; ----- push registers

		mov		[rsp+8],rcx		; destination
		mov		[rsp+16],r9		; remainder
		mov		[rsp+24],rbx	; rbx and r15 go to the shadow space,
		mov		[rsp+32],r15	; the other five saved registers are pushed
		
		push	rsi
		push	rdi
		push	r12
		push	r13
		push	r14

; ----- local registers
; [rsp+78h] remainder
; [rsp+70h] destination
; [rsp+20h] (4*8=32) accumulator
; [rsp+0] (4*8=32) mask
; r15..r8 = dividend
; rdx:rbx:rdi:rsi = divisor
; rax = temporary
; rcx = loop counter
; (offsets above are relative to rsp after the 5 pushes and the sub below:
;  28h + 40h = 68h is added to the entry offsets 8 and 10h)

		sub		rsp,40h
		
; ----- prepare divisor -> rdx:rbx:rdi:rsi

		mov		rax,rdx			; keep the num1 pointer, rdx becomes divisor limb 3
		mov		rsi,[r8]
		mov		rdi,[r8+8]
		mov		rbx,[r8+16]
		mov		rdx,[r8+24]

; ----- prepare dividend -> r15..r8

		mov		r8,[rax]
		mov		r9,[rax+8]
		mov		r10,[rax+16]
		mov		r11,[rax+24]
		xor		r12,r12			; upper half = running remainder, starts at zero
		xor		r13,r13
		xor		r14,r14
		xor		r15,r15

; ----- prepare accumulator -> [rsp+20h]

		xor		rax,rax			; RAX <- 0
		mov		[rsp+20h+0],rax
		mov		[rsp+20h+8],rax
		mov		[rsp+20h+16],rax
		mov		[rsp+20h+24],rax
		
; ----- prepare mask -> [rsp+0]

		mov		[rsp+0],rax
		mov		[rsp+8],rax
		mov		[rsp+16],rax
		mov		rax,8000000000000000h ; set bit 63
		mov		[rsp+24],rax	; top limb of the mask: quotient bit 255 comes first
		
; ----- prepare to divide

		mov		rcx,256			; number of bits

DivU256A_x64_2:		

; ----- shift dividend left

		shl		r8,1
		rcl		r9,1
		rcl		r10,1
		rcl		r11,1
		rcl		r12,1			; the next dividend bit enters the remainder here
		rcl		r13,1
		rcl		r14,1
		rcl		r15,1
		
; ----- compare dividend with divisor
; (limb by limb from the top; unsigned ja/jb, and equal limbs fall through
;  to the next lower limb)

		cmp		r15,rdx
		ja		DivU256A_x64_3
		jb		DivU256A_x64_4
		
		cmp		r14,rbx
		ja		DivU256A_x64_3
		jb		DivU256A_x64_4
		
		cmp		r13,rdi
		ja		DivU256A_x64_3
		jb		DivU256A_x64_4
		
		cmp		r12,rsi
		jb		DivU256A_x64_4	; equal or above falls into the subtraction
		
; ----- sub divisor from dividend		
		
DivU256A_x64_3:

		sub		r12,rsi
		sbb		r13,rdi
		sbb		r14,rbx
		sbb		r15,rdx

; ----- add mask to accumulator

		mov		rax,[rsp+0]
		or		[rsp+20h+0],rax
		mov		rax,[rsp+8]
		or		[rsp+20h+8],rax
		mov		rax,[rsp+16]
		or		[rsp+20h+16],rax
		mov		rax,[rsp+24]
		or		[rsp+20h+24],rax

; ----- shift mask right

DivU256A_x64_4:

		shr		qword [rsp+24],1
		rcr		qword [rsp+16],1
		rcr		qword [rsp+8],1
		rcr		qword [rsp+0],1
		
; ----- next loop (do not use "loop" instruction, it may be too slow)

		dec		rcx
		jnz		DivU256A_x64_2

; ----- save result
		
		mov		rcx,[rsp+70h]	; destination pointer saved on entry
		mov		rax,[rsp+20h+0]
		mov		[rcx],rax
		mov		rax,[rsp+20h+8]
		mov		[rcx+8],rax
		mov		rax,[rsp+20h+16]
		mov		[rcx+16],rax
		mov		rax,[rsp+20h+24]
		mov		[rcx+24],rax

; ----- save remainder

		mov		r9,[rsp+78h]	; remainder pointer saved on entry
		test	r9,r9
		jz		DivU256A_x64_9	; NULL: the caller does not want the remainder
		mov		[r9],r12
		mov		[r9+8],r13
		mov		[r9+16],r14
		mov		[r9+24],r15

; ----- pop registers

DivU256A_x64_9:

		add		rsp,40h
		
		pop		r14
		pop		r13
		pop		r12
		pop		rdi
		pop		rsi
		
		mov		r15,[rsp+32]
		mov		rbx,[rsp+24]
		ret

; =============================================================================
;                          DIV uint256 (this = num1 / 0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2, r9=u64* rem

global DivU256B_x64

; Bit-by-bit (shift and subtract) division by a two-limb divisor:
; dst = num1 / (low two limbs of num2); num2 limbs 2 and 3 are never read.
; The remainder is stored through r9 when that pointer is not NULL, with its
; upper two limbs written as zero. The running remainder lives in r15:r14
; above the four dividend limbs r13..r10; the quotient mask is kept in
; registers and the quotient accumulator on the stack.
; No check for a zero divisor is made here.
DivU256B_x64:

; ----- push registers

		mov		[rsp+8],rcx		; destination
		mov		[rsp+16],r9		; remainder
		mov		[rsp+24],rbx	; rbx and r15 go to the shadow space,
		mov		[rsp+32],r15	; the other five saved registers are pushed
		
		push	rsi
		push	rdi
		push	r12
		push	r13
		push	r14

; ----- local registers
; [rsp+58h] remainder
; [rsp+50h] destination
; [rsp+0] (4*8=32) accumulator
; r15..r10 = dividend
; r9:r8 = divisor
; rdx:rbx:rdi:rsi = mask
; rax = temporary
; rcx = loop counter
; (offsets above are relative to rsp after the 5 pushes and the sub below:
;  28h + 20h = 48h is added to the entry offsets 8 and 10h)

		sub		rsp,20h
		
; ----- prepare divisor -> r9:r8

		mov		r9,[r8+8]
		mov		r8,[r8]			; r8 turns from pointer into divisor limb 0

; ----- prepare dividend -> r15..r10

		mov		r10,[rdx]
		mov		r11,[rdx+8]
		mov		r12,[rdx+16]
		mov		r13,[rdx+24]
		xor		r14,r14			; r15:r14 = running remainder, starts at zero
		xor		r15,r15

; ----- prepare mask -> rdx:rbx:rdi:rsi

		xor		rsi,rsi			; RSI <- 0
		xor		rdi,rdi
		xor		rbx,rbx
		mov		rdx,8000000000000000h ; set bit 63
		
; ----- prepare accumulator -> [rsp+0]

		mov		[rsp+0],rsi		; rsi is zero here
		mov		[rsp+8],rsi
		mov		[rsp+16],rsi
		mov		[rsp+24],rsi
		
; ----- prepare to divide

		mov		rcx,256			; number of bits

DivU256B_x64_2:		

; ----- shift dividend left

		shl		r10,1
		rcl		r11,1
		rcl		r12,1
		rcl		r13,1
		rcl		r14,1			; the next dividend bit enters the remainder here
		rcl		r15,1
		jc		DivU256B_x64_3	; a bit fell out of r15: value exceeds 128 bits, subtract
		
; ----- compare dividend with divisor

		cmp		r15,r9
		ja		DivU256B_x64_3
		jb		DivU256B_x64_4
		
		cmp		r14,r8
		jb		DivU256B_x64_4	; equal or above falls into the subtraction
		
; ----- sub divisor from dividend		
		
DivU256B_x64_3:

		sub		r14,r8
		sbb		r15,r9

; ----- add mask to accumulator

		or		[rsp+0],rsi
		or		[rsp+8],rdi
		or		[rsp+16],rbx
		or		[rsp+24],rdx

; ----- shift mask right

DivU256B_x64_4:

		shr		rdx,1
		rcr		rbx,1
		rcr		rdi,1
		rcr		rsi,1
		
; ----- next loop (do not use "loop" instruction, it may be too slow)

		dec		rcx
		jnz		DivU256B_x64_2

; ----- save result
		
		mov		rcx,[rsp+50h]	; destination pointer saved on entry
		mov		rax,[rsp+0]
		mov		[rcx],rax
		mov		rax,[rsp+8]
		mov		[rcx+8],rax
		mov		rax,[rsp+16]
		mov		[rcx+16],rax
		mov		rax,[rsp+24]
		mov		[rcx+24],rax

; ----- save remainder

		mov		r9,[rsp+58h]	; remainder pointer saved on entry
		test	r9,r9
		jz		DivU256B_x64_9	; NULL: the caller does not want the remainder
		mov		[r9],r14
		mov		[r9+8],r15
		mov		qword [r9+16],0	; remainder is at most two limbs wide
		mov		qword [r9+24],0

; ----- pop registers

DivU256B_x64_9:

		add		rsp,20h
		
		pop		r14
		pop		r13
		pop		r12
		pop		rdi
		pop		rsi
		
		mov		r15,[rsp+32]
		mov		rbx,[rsp+24]
		ret

; =============================================================================
;                          DIV uint256 (this = num1 / 0:0:0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64 src2
; output: rax=remainder

global DivU256C_x64

; dst = src1 / src2 where src2 is a 64-bit value in r8; the remainder is
; returned in rax. Long division from the top limb down: the remainder of
; each step stays in rdx and becomes the high half of the next dividend.
; No check for a zero divisor is made. Scratch: rdx, r9, r10, r11.
DivU256C_x64:
		mov		rax,[rdx+24]		; limb 3 goes straight into the first divide
		mov		r9,[rdx+16]			; limb 2
		mov		r10,[rdx+8]			; limb 1
		mov		r11,[rdx]			; limb 0

		xor		edx,edx				; the first dividend is 0:limb3
		div		r8
		mov		[rcx+24],rax

		mov		rax,r9
		div		r8
		mov		[rcx+16],rax

		mov		rax,r10
		div		r8
		mov		[rcx+8],rax

		mov		rax,r11
		div		r8
		mov		[rcx],rax

		mov		rax,rdx				; return the final remainder
		ret

; =============================================================================
;                          DIV uint256 (this = num1 / 0:0:0:0:0:0:0:num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u32 src2
; output: eax=remainder

global DivU256D_x64

; dst = src1 / src2 where src2 is a 32-bit value in r8d; the remainder is
; returned in eax. Each 64-bit limb is divided as two 32-bit halves with the
; 32-bit divide (edx:eax / r8d), the remainder in edx chaining from one half
; to the next. No check for a zero divisor is made.
; Scratch: rdx, r9, r10, r11.
DivU256D_x64:
		mov		r11,rdx				; r11 = src1; edx is the running remainder
		xor		edx,edx

; ----- limb 3

		mov		r9,[r11+24]
		mov		rax,r9
		shr		rax,32				; high half first
		div		r8d
		mov		r10,rax
		shl		r10,32				; r10 = quotient of the high half, in place
		mov		rax,r9				; low half (div r8d only looks at eax)
		div		r8d
		or		rax,r10
		mov		[rcx+24],rax

; ----- limb 2

		mov		r9,[r11+16]
		mov		rax,r9
		shr		rax,32
		div		r8d
		mov		r10,rax
		shl		r10,32
		mov		rax,r9
		div		r8d
		or		rax,r10
		mov		[rcx+16],rax

; ----- limb 1

		mov		r9,[r11+8]
		mov		rax,r9
		shr		rax,32
		div		r8d
		mov		r10,rax
		shl		r10,32
		mov		rax,r9
		div		r8d
		or		rax,r10
		mov		[rcx+8],rax

; ----- limb 0

		mov		r9,[r11]
		mov		rax,r9
		shr		rax,32
		div		r8d
		mov		r10,rax
		shl		r10,32
		mov		rax,r9
		div		r8d
		or		rax,r10
		mov		[rcx],rax

		mov		eax,edx				; return the final remainder
		ret

; =============================================================================
;    DIV uint256 (this = num1H:num1L / num2), with full range of dividend
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1L, r8=u64* src1H, r9=u64* src2, [rsp+28h]=u64* rem

global DivU256E_x64

; Bit-by-bit (shift and subtract) division of a 512-bit dividend
; (src1H:src1L) by a 256-bit divisor. Only 256 quotient bits are produced
; and written to dst. The remainder is stored through the fifth argument
; (read from the stack) when that pointer is not NULL.
; The whole dividend is held in r15..r8 and shifted left one bit per
; iteration; r15..r12 is compared with the divisor and reduced in place.
; No check for a zero divisor is made here.
DivU256E_x64:

; ----- push registers (5 registers, shift offset in stack by 28h)

		mov		[rsp+8],rcx		; destination
		mov		[rsp+18h],rbx	; rbx and r15 go to the shadow space,
		mov		[rsp+20h],r15	; the other five saved registers are pushed
		
		push	rsi
		push	rdi
		push	r12
		push	r13
		push	r14

; ----- local registers (shift offset in stack by 28h+40h=68h)
; [rsp+90h] remainder
; [rsp+70h] destination
; [rsp+20h] (4*8=32) accumulator
; [rsp+0] (4*8=32) mask
; r15..r8 = dividend
; rdx:rbx:rdi:rsi = divisor
; rax = temporary
; rcx = loop counter
; (the remainder pointer is the stack argument at entry offset 28h,
;  28h + 68h = 90h)

		sub		rsp,40h
		
; ----- prepare divisor -> rdx:rbx:rdi:rsi

		mov		rax,rdx			; keep the src1L pointer, rdx becomes divisor limb 3
		mov		rsi,[r9]
		mov		rdi,[r9+8]
		mov		rbx,[r9+16]
		mov		rdx,[r9+24]

; ----- prepare dividend -> r15..r8

		mov		r15,[r8+24]		; high half from src1H
		mov		r14,[r8+16]
		mov		r13,[r8+8]
		mov		r12,[r8]
		mov		r11,[rax+24]	; low half from src1L
		mov		r10,[rax+16]
		mov		r9,[rax+8]
		mov		r8,[rax]

; ----- prepare accumulator -> [rsp+20h]

		xor		rax,rax			; RAX <- 0
		mov		[rsp+20h+0],rax
		mov		[rsp+20h+8],rax
		mov		[rsp+20h+16],rax
		mov		[rsp+20h+24],rax
		
; ----- prepare mask -> [rsp+0]

		mov		[rsp+0],rax
		mov		[rsp+8],rax
		mov		[rsp+16],rax
		mov		rax,8000000000000000h ; set bit 63
		mov		[rsp+24],rax	; top limb of the mask: quotient bit 255 comes first
		
; ----- prepare to divide

		mov		rcx,256			; number of bits

DivU256E_x64_2:		

; ----- shift dividend left

		shl		r8,1
		rcl		r9,1
		rcl		r10,1
		rcl		r11,1
		rcl		r12,1
		rcl		r13,1
		rcl		r14,1
		rcl		r15,1
		jc		DivU256E_x64_3	; a bit fell out of r15: value exceeds 256 bits, subtract
		
; ----- compare dividend with divisor
; (limb by limb from the top; unsigned ja/jb, and equal limbs fall through
;  to the next lower limb)

		cmp		r15,rdx
		ja		DivU256E_x64_3
		jb		DivU256E_x64_4
		
		cmp		r14,rbx
		ja		DivU256E_x64_3
		jb		DivU256E_x64_4
		
		cmp		r13,rdi
		ja		DivU256E_x64_3
		jb		DivU256E_x64_4
		
		cmp		r12,rsi
		jb		DivU256E_x64_4	; equal or above falls into the subtraction
		
; ----- sub divisor from dividend		
		
DivU256E_x64_3:

		sub		r12,rsi
		sbb		r13,rdi
		sbb		r14,rbx
		sbb		r15,rdx

; ----- add mask to accumulator

		mov		rax,[rsp+0]
		or		[rsp+20h+0],rax
		mov		rax,[rsp+8]
		or		[rsp+20h+8],rax
		mov		rax,[rsp+16]
		or		[rsp+20h+16],rax
		mov		rax,[rsp+24]
		or		[rsp+20h+24],rax

; ----- shift mask right

DivU256E_x64_4:

		shr		qword [rsp+24],1
		rcr		qword [rsp+16],1
		rcr		qword [rsp+8],1
		rcr		qword [rsp+0],1
		
; ----- next loop (do not use "loop" instruction, it may be too slow)

		dec		rcx
		jnz		DivU256E_x64_2

; ----- save result
		
		mov		rcx,[rsp+70h]	; destination pointer saved on entry
		mov		rax,[rsp+20h+0]
		mov		[rcx],rax
		mov		rax,[rsp+20h+8]
		mov		[rcx+8],rax
		mov		rax,[rsp+20h+16]
		mov		[rcx+16],rax
		mov		rax,[rsp+20h+24]
		mov		[rcx+24],rax

; ----- save remainder

		mov		r9,[rsp+90h]	; fifth argument: remainder pointer
		test	r9,r9
		jz		DivU256E_x64_9	; NULL: the caller does not want the remainder
		mov		[r9],r12
		mov		[r9+8],r13
		mov		[r9+16],r14
		mov		[r9+24],r15

; ----- pop registers

DivU256E_x64_9:

		add		rsp,40h
		
		pop		r14
		pop		r13
		pop		r12
		pop		rdi
		pop		rsi
		
		mov		r15,[rsp+20h]
		mov		rbx,[rsp+18h]
		ret

; =============================================================================
;                          DIV10 uint256 (this = num / 10)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src
; output: eax=remainder
;
; Long division by 10 from the most significant limb down, without "div".
; Each step multiplies by the constant 0cccccccccccccccdh and shifts the high
; half of the product right by 3, which gives value/10. The top limb N3 is
; divided as a whole 64-bit value. Every lower limb is split into two 32-bit
; halves; each half is combined with the previous remainder (0..9) in bits
; 32 and up, so each partial quotient fits into 32 bits.
; All four source limbs are loaded before the first store to dst.
; r12 is preserved in the caller's shadow space at [rsp+8].

global DivU256F_x64

DivU256F_x64:

		; push registers
		mov		[rsp+8],r12

		; get number -> r12:r11:r10:r9
		mov		r12,[rdx+24] ; r12 = N3
		mov		r11,[rdx+16] ; r11 = N2
		mov		r10,[rdx+8]	; r10 = N1
		mov		r9,[rdx]	; r9 = N0
		mov		r8,0cccccccccccccccdh ; r8 = 1/10

; N3
		; N3/10 -> rdx
		mov		rax,r8		; rax <- 1/10
		mul		r12			; mul by N3
		shr		rdx,3
		mov		[rcx+24],rdx ; result N3

		; remainder N3 % 10 -> r12
		lea		rax,[rdx+rdx*4] ; rax <- result N3 * 5
		add		rax,rax		; rax <- result N3 * 10
		sub		r12,rax		; r12 <- remainder 0..9

; N2
		; N2H/10 -> r12, rdx
		mov		rax,r11		; rax <- N2
		shr		rax,32		; rax <- N2H
		shl		r12,32		; r12 <- shift remainder to high position
		or		rax,r12		; rax <- remainder:N2H
		mul		r8
		shr		rdx,3		; rdx <- result N2H/10
		mov		r12,rdx
		shl		r12,32		; r12 <- save result into high position

		; remainder N2H % 10 -> rdx
		; (only the low 32 bits of rdx are the remainder here; the bits above
		; them are removed by the "shl rdx,32" of the next step)
		lea		rax,[rdx+rdx*4] ; rax <- result N2H * 5
		add		rax,rax		; rax <- result N2H * 10
		mov		rdx,r11		; rdx <- N2
		shr		rdx,32		; rdx <- N2H
		sub		rdx,rax		; rdx <- remainder
		
		; N2L/10 -> r12, rdx
		mov		eax,r11d	; rax <- N2L
		shl		rdx,32		; rdx <- shift remainder to high position
		or		rax,rdx		; rax <- remainder:N2L
		mul		r8
		shr		rdx,3		; rdx <- result N2L/10
		or		r12,rdx		; r12 <- result of N2
		mov		[rcx+16],r12 ; save result N2

		; remainder N2L % 10 -> rdx
		; (32-bit subtraction keeps just the remainder and zeroes the upper
		; half of rdx)
		lea		rax,[rdx+rdx*4] ; rax <- result N2L * 5
		add		rax,rax		; rax <- result N2L * 10
		mov		edx,r11d	; rdx <- N2L
		sub		edx,eax		; rdx <- remainder

; N1
		; N1H/10 -> r11, rdx
		mov		rax,r10		; rax <- N1
		shr		rax,32		; rax <- N1H
		shl		rdx,32		; rdx <- shift remainder to high position
		or		rax,rdx		; rax <- remainder:N1H
		mul		r8
		shr		rdx,3		; rdx <- result N1H/10
		mov		r11,rdx
		shl		r11,32		; r11 <- save result into high position

		; remainder N1H % 10 -> rdx
		; (low 32 bits only, see N2H above)
		lea		rax,[rdx+rdx*4] ; rax <- result N1H * 5
		add		rax,rax		; rax <- result N1H * 10
		mov		rdx,r10		; rdx <- N1
		shr		rdx,32		; rdx <- N1H
		sub		rdx,rax		; rdx <- remainder
		
		; N1L/10 -> r11, rdx
		mov		eax,r10d	; rax <- N1L
		shl		rdx,32		; rdx <- shift remainder to high position
		or		rax,rdx		; rax <- remainder:N1L
		mul		r8
		shr		rdx,3		; rdx <- result N1L/10
		or		r11,rdx		; r11 <- result of N1
		mov		[rcx+8],r11	; save result N1

		; remainder N1L % 10 -> rdx
		lea		rax,[rdx+rdx*4] ; rax <- result N1L * 5
		add		rax,rax		; rax <- result N1L * 10
		mov		edx,r10d	; rdx <- N1L
		sub		edx,eax		; rdx <- remainder

; N0
		; N0H/10 -> r10, rdx
		mov		rax,r9		; rax <- N0
		shr		rax,32		; rax <- N0H
		shl		rdx,32		; rdx <- shift remainder to high position
		or		rax,rdx		; rax <- remainder:N0H
		mul		r8
		shr		rdx,3		; rdx <- result N0H/10
		mov		r10,rdx
		shl		r10,32		; r10 <- save result into high position

		; remainder N0H % 10 -> rdx
		; (low 32 bits only, see N2H above)
		lea		rax,[rdx+rdx*4] ; rax <- result N0H * 5
		add		rax,rax		; rax <- result N0H * 10
		mov		rdx,r9		; rdx <- N0
		shr		rdx,32		; rdx <- N0H
		sub		rdx,rax		; rdx <- remainder
		
		; N0L/10 -> r10, rdx
		mov		eax,r9d	; rax <- N0L
		shl		rdx,32		; rdx <- shift remainder to high position
		or		rax,rdx		; rax <- remainder:N0L
		mul		r8
		shr		rdx,3		; rdx <- result N0L/10
		or		r10,rdx		; r10 <- result of N0
		mov		[rcx],r10	; save result N0

		; remainder N0L % 10 -> rdx
		lea		rax,[rdx+rdx*4] ; rax <- result N0L * 5
		add		rax,rax		; rax <- result N0L * 10
		mov		edx,r9d	; rdx <- N0L
		sub		edx,eax		; rdx <- remainder

		; return the final remainder (0..9) of the whole number
		mov		eax,edx

		; pop registers
		mov		r12,[rsp+8]
		ret

; =============================================================================
;                          NEG uint256 (this = -this)
; =============================================================================
; inputs: rcx=u64* dst
;
; Two's complement negation in place, computed as -x = ~x + 1: the four
; limbs are loaded, inverted, incremented as one 256-bit value and stored
; back to the same location.

global NegU256A_x64

NegU256A_x64:
		; load all limbs, least significant first
		mov		rax,[rcx]
		mov		r9,[rcx+8]
		mov		r10,[rcx+16]
		mov		r11,[rcx+24]

		; bitwise complement ("not" leaves the flags alone)
		not		rax
		not		r9
		not		r10
		not		r11

		; +1, carry rippling through the upper limbs
		add		rax,1
		adc		r9,0
		adc		r10,0
		adc		r11,0

		; write the result back
		mov		[rcx],rax
		mov		[rcx+8],r9
		mov		[rcx+16],r10
		mov		[rcx+24],r11
		ret

; =============================================================================
;                          NEG uint256 (this = -num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src
;
; Stores the two's complement of the 256-bit number at src to dst, computed
; as -x = ~x + 1. The whole source is read before the first store to dst.

global NegU256B_x64

NegU256B_x64:
		; fetch the source limbs, least significant first
		mov		rax,[rdx]
		mov		r9,[rdx+8]
		mov		r10,[rdx+16]
		mov		r11,[rdx+24]

		; invert every bit ("not" leaves the flags alone)
		not		rax
		not		r9
		not		r10
		not		r11

		; add one, carry rippling through the upper limbs
		add		rax,1
		adc		r9,0
		adc		r10,0
		adc		r11,0

		; store the result
		mov		[rcx],rax
		mov		[rcx+8],r9
		mov		[rcx+16],r10
		mov		[rcx+24],r11
		ret

; =============================================================================
;                          NEG uint256 (this = -0:num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src
;
; Negates a number given by two 64-bit limbs at src, taken as a 256-bit
; value with zero upper half, and stores the 256-bit result to dst. Both
; upper result limbs equal the borrow coming out of the lower half: all ones
; if the source is non-zero, zero otherwise.
; Both source limbs are read before the first store to dst.

global NegU256C_x64

NegU256C_x64:
		xor		r9d,r9d			; r9 <- 0 (before "neg", xor clears CF)
		mov		rax,[rdx]		; rax <- low limb
		mov		r10,[rdx+8]		; r10 <- high limb

		neg		rax				; rax <- -low, CF set if low != 0
		sbb		r9,r10			; r9 <- 0 - high - borrow
		sbb		r10,r10			; r10 <- 0 - borrow (0 or all ones), CF kept
		mov		r11,r10			; top limb gets the same borrow mask

		mov		[rcx],rax
		mov		[rcx+8],r9
		mov		[rcx+16],r10
		mov		[rcx+24],r11
		ret

; =============================================================================
;                          NEG uint256 (this = -0:0:0:num)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 src
;
; Negates a single 64-bit value zero-extended to 256 bits and stores the
; result to dst. The lowest limb is -src; the three upper limbs are the
; borrow out of it: all ones if src is non-zero, zero otherwise.

global NegU256D_x64

NegU256D_x64:
		neg		rdx				; rdx <- -src, CF set if src != 0
		sbb		rax,rax			; rax <- 0 - borrow (0 or all ones)

		mov		[rcx],rdx
		mov		[rcx+8],rax
		mov		[rcx+16],rax
		mov		[rcx+24],rax
		ret

; =============================================================================
;                              BITS uint256
; =============================================================================
; inputs: rcx=u64* dst
; output: rax=bits
;
; Returns the number of significant bits of the 256-bit number at rcx, i.e.
; the index of its highest set bit plus 1, or 0 if the number is zero.
; The number is only read. The limbs are scanned from the most significant
; one down; "bsr" sets ZF when the scanned limb is zero.

global BitsU256_x64

BitsU256_x64:
		bsr		rax,qword [rcx+3*8]
		jnz		.in_limb3
		bsr		rax,qword [rcx+2*8]
		jnz		.in_limb2
		bsr		rax,qword [rcx+8]
		jnz		.in_limb1
		bsr		rax,qword [rcx]
		jnz		.in_limb0

		; all four limbs are zero
		xor		eax,eax
		ret

		; rax = bit index inside the limb; add the limb's base and 1
.in_limb3:
		add		rax,3*64+1
		ret

.in_limb2:
		add		rax,2*64+1
		ret

.in_limb1:
		add		rax,64+1
		ret

.in_limb0:
		inc		rax
		ret
